# Script setup: clear the workspace, attach packages, load project helpers.
# NOTE(review): rm(list = ls()) wipes every object in the calling environment;
# the blocks below all run inside local(), so this is kept only for parity
# with the original workflow.
rm(list=ls())
# library() stops immediately when a package is missing, whereas require()
# only warns and returns FALSE, deferring the failure to the first call.
library(quanteda)
library(LSX)
library(stringi)
# functions.R is a project file; diagnosys(), used further down, is not
# defined in this script and presumably comes from there.
source("functions.R")

# Build the manually coded English corpus from data/YoungSoroka.2012.Rdata
# and save it as data_corpus_manual_en.RDS.
local({
    # load() restores the saved objects into this local environment; the code
    # below relies on it providing a data frame named `YS`.
    load("data/YoungSoroka.2012.Rdata")

    # Text clean-up: runs of "x" starting at a word boundary, the NYT
    # copyright line, backslashes and doubled single quotes.
    YS$body <- stri_replace_all_regex(YS$body, "\\bx+", " ")
    YS$body <- stri_replace_all_regex(YS$body, "Copyright \\d{4} The New York Times Company", "")
    YS$body <- stri_replace_all_fixed(YS$body, "\\", "")
    YS$body <- stri_replace_all_fixed(YS$body, "''", "")

    # Map the three coders' labels to numeric scores.
    score <- c("Negative" =  -1, "Neutral" = 0, "Positive" = 1)
    coders <- c("code_1", "code_2", "code_3")
    for (col in coders) {
        # as.character() makes the lookup go by label even when the column is
        # a factor (a factor index would use its integer codes instead).
        YS[[col]] <- score[as.character(YS[[col]])]
    }

    # Count, per document, how many coders gave each score. Fixing the levels
    # guarantees the columns are always -1, 0, 1 in that order, which the
    # `which.max(...) - 2` mapping below depends on.
    votes <- factor(unlist(YS[, coders]), levels = c(-1, 0, 1))
    tb <- table(rep(YS$id, 3), votes)
    # Reorder rows to match YS; drop = FALSE keeps a one-row table 2-D.
    tb <- tb[as.character(YS$id), , drop = FALSE]

    # human = (size of largest vote - 1) * (score with the most votes):
    # 0 when no two coders agree, +/-1 for two, +/-2 for three agreeing.
    agreement <- apply(tb, 1, max)
    majority <- apply(tb, 1, which.max) - 2
    YS$human <- (agreement - 1) * majority
    # A row with no valid codes would otherwise evaluate to (0 - 1) * -1 = 1.
    YS$human[agreement == 0] <- NA

    corp <- corpus(YS, text_field = "body")
    saveRDS(corp, "data_corpus_manual_en.RDS")
})

# Build the English corpus from data/data_economy_nyt.RDS, reshape it to
# sentences and save it as data_corpus_en.RDS.
local({
    dat <- readRDS("data/data_economy_nyt.RDS")

    # Source label plus a sequential "nyt_<row>" identifier.
    ids <- paste0("nyt_", seq_len(nrow(dat)))
    dat[["publication"]] <- "nyt"
    dat[["docid"]] <- ids

    # Calendar year and month taken from the date column.
    dat[["year"]] <- lubridate::year(dat$date)
    dat[["month"]] <- lubridate::month(dat$date)

    # doc_id keeps the identifier as character; docid becomes a factor whose
    # levels follow the order of first appearance.
    dat[["doc_id"]] <- ids
    dat[["docid"]] <- factor(ids, levels = unique(ids))

    # diagnosys() is not defined in this script (presumably functions.R);
    # whatever it returns for the article bodies is appended as columns.
    dat <- cbind(dat, diagnosys(dat$body))

    sentences <- corpus_reshape(corpus(dat, text_field = "body"),
                                to = "sentences")
    saveRDS(sentences, "data_corpus_en.RDS")
})

# Build the manually coded Japanese corpus from coding/sample.csv and save it
# as data_corpus_manual_ja.RDS.
local({

    dat <- read.csv("coding/sample.csv", stringsAsFactors = FALSE)
    # Dates in the CSV are parsed as month/day/four-digit-year.
    dat$date <- as.Date(dat$date, format = "%m/%d/%Y")
    dat$year <- lubridate::year(dat$date)
    dat$month <- lubridate::month(dat$date)

    # Map the three coders' labels to numeric scores.
    score <- c("Negative" =  -1, "Neutral" = 0, "Positive" = 1)
    coders <- c("code_1", "code_2", "code_3")
    for (col in coders) {
        # Look up by label; as.character() guards against non-character input.
        dat[[col]] <- score[as.character(dat[[col]])]
    }

    # Count, per document, how many coders gave each score. Fixing the levels
    # guarantees the columns are always -1, 0, 1 in that order, which the
    # `which.max(...) - 2` mapping below depends on.
    votes <- factor(unlist(dat[, coders]), levels = c(-1, 0, 1))
    tb <- table(rep(dat$doc_id, 3), votes)
    # Select rows by name: a numeric doc_id would otherwise index by position.
    # drop = FALSE keeps a one-row table 2-D.
    tb <- tb[as.character(dat$doc_id), , drop = FALSE]

    # human = (size of largest vote - 1) * (score with the most votes):
    # 0 when no two coders agree, +/-1 for two, +/-2 for three agreeing.
    agreement <- apply(tb, 1, max)
    majority <- apply(tb, 1, which.max) - 2
    dat$human <- (agreement - 1) * majority
    # A row with no valid codes would otherwise evaluate to (0 - 1) * -1 = 1.
    dat$human[agreement == 0] <- NA

    # corpus() is called with its default text/document-id fields here.
    corp <- corpus(dat)
    saveRDS(corp, "data_corpus_manual_ja.RDS")

})

# Build the Japanese corpus from data/data_politics_asahi.RDS, split it on
# the full stop "。" and save it as data_corpus_ja.RDS.
local({

    dat <- readRDS("data/data_politics_asahi.RDS")
    # The corpus is built once, after the text has been cleaned below; an
    # earlier corpus() call on the raw text was never used and was removed.
    dat$publication <- "asahi"
    dat$docid <- paste0("asahi_", seq_len(nrow(dat)))

    dat$year <- lubridate::year(dat$date)
    dat$month <- lubridate::month(dat$date)

    # doc_id keeps the identifier as character; docid becomes a factor whose
    # levels follow the order of first appearance.
    dat$doc_id <- dat$docid
    dat$docid <- factor(dat$docid, levels = unique(dat$docid))

    # Replace text enclosed in 【】 or full-width （） with a space.
    dat$body <- stri_replace_all_regex(
        dat$body, "【(.+?)】|（(.+?)）", " "
    )
    # Turn characters in U+25A0..U+25FF into "。" so that they act as
    # segment boundaries in corpus_segment() below.
    dat$body <- stri_replace_all_regex(
        dat$body, "[\u25A0-\u25FF]", "。"
    )
    # diagnosys() is not defined in this script (presumably functions.R);
    # whatever it returns for the article bodies is appended as columns.
    dat <- cbind(dat, diagnosys(dat$body))

    corp <- corpus(dat, text_field = "body")
    # Split after each "。" and keep the delimiter in the text.
    corp <- corpus_segment(corp, "。", valuetype = "fixed",
                           extract_pattern = FALSE,
                           pattern_position = "after")
    saveRDS(corp, "data_corpus_ja.RDS")

})
